
rm(list=ls())
## load tsv data table

require(data.table)
data<- data.frame(fread("Taware/meta_760_original.tsv"))

## 1.  remove metabolites with unknown chebi ID
data_chebi = data[sapply(data[,1], function(x) grepl("CHEBI", x) ),]

## 2.  remove columns with NAs
data_nonna = data_chebi[,!apply(is.na(data_chebi), 2, any)]

## 3. remove metabolites with >=50% zeros
data_metab = apply(data_nonna[,11:102], c(1,2), as.numeric)
metab_delete = rowMeans(data_metab==0) >= 0.5
data_760 = data_nonna[!metab_delete,]


## 4. save the clean data
library(xlsx)
write.xlsx(data_760,"Taware/meta_760.xlsx")

## save chebi IDs for generating pathways
chebi_760 = data_760$database_identifier
write.table(chebi_760, file = "Taware/chebi_760.txt",row.names = F,col.names = F, quote = FALSE)

## 5. save the data matrix
data_matrix = t(data_metab[!metab_delete,])
rownames(data_matrix)
colnames(data_matrix) = data_760$metabolite_identification

## 6. normalized the data
hist(data_matrix)
if(min(data_matrix)>0){
  data_matrix_normal = log2(data_matrix)
  hist(data_matrix_normal)
} else{
  a = 1e+5 ## default value, could be other values 
  data_matrix_normal = log2((data_matrix + sqrt(data_matrix^2 + a^2))/2)
  hist(data_matrix_normal)
}

y = as.numeric(grepl("HCS",rownames(data_matrix_normal)))
d760 = cbind(data_matrix_normal, y) 
identical(colnames(data_matrix_normal), data_760$metabolite_identification)

save(d760, file="Taware/d760.RData")



############################   ========================= smpdb_HMDB_pathway =========================   ############################ 

smpdbhmdb <- data.frame(fread("Taware/pathwaydatabase760/mbrole2_smpdbhmdb.tsv"))
smpdbhmdbpath_chebi = smpdbhmdb$Submitted.IDs
names(smpdbhmdbpath_chebi) = smpdbhmdb$ID.Annotation
listsmpdbhmdbpath = lapply(smpdbhmdbpath_chebi, function(x) unlist(strsplit(x, "|", fixed = TRUE)) )

## remove duplicate the pathways
listsmpdbhmdbpath_uniq = listsmpdbhmdbpath[which(!duplicated(listsmpdbhmdbpath))]

## map cheiID to metabolites identifies
smpdbhmdbpath <- lapply(listsmpdbhmdbpath_uniq, function(x) ## for each wikipathway, to retrieve which metabolites of my data are belong to this pathway
  data_760$metabolite_identification[which(chebi_760 %in% x )] )

smpdbhmdbpath_metab = smpdbhmdbpath[which(!duplicated(smpdbhmdbpath))]

save(smpdbhmdbpath_metab, file="Taware/pathwaydatabase760/smpdbhmdbpath_metab_760.RData")




############################   ========================= Biocycpathway =========================   ############################ 

biocyc <- data.frame(fread("Taware/pathwaydatabase760/mbrole2_biocyc.tsv"))
biocycpath_chebi = biocyc$Submitted.IDs
names(biocycpath_chebi) = biocyc$ID.Annotation
listbiocycpath = lapply(biocycpath_chebi, function(x) unlist(strsplit(x, "|", fixed = TRUE)) )

## remove duplicate the pathways
listbiocycpath_uniq = listbiocycpath[which(!duplicated(listbiocycpath))]

## map cheiID to metabolites identifies
biocycpath <- lapply(listbiocycpath_uniq, function(x) ## for each wikipathway, to retrieve which metabolites of my data are belong to this pathway
  data_760$metabolite_identification[which(chebi_760 %in% x )] )

biocycpath_metab = biocycpath[which(!duplicated(biocycpath))]

save(biocycpath_metab, file="Taware/pathwaydatabase760/biocycpath_metab_760.RData")



############################   ========================= keggpathway =========================   ############################ 

kegg<- data.frame(fread("Taware/pathwaydatabase760/mbrole2_kegg.tsv"))
keggpath_chebi = kegg$Submitted.IDs
names(keggpath_chebi) = kegg$ID.Annotation
listkeggpath = lapply(keggpath_chebi, function(x) unlist(strsplit(x, "|", fixed = TRUE)) )

## remove duplicate the pathways
listkeggpath_uniq = listkeggpath[which(!duplicated(listkeggpath))]

## map cheiID to metabolites identifies
keggpath <- lapply(listkeggpath_uniq, function(x) ## for each wikipathway, to retrieve which metabolites of my data are belong to this pathway
  data_760$metabolite_identification[which(chebi_760 %in% x )] )

keggpath_metab = keggpath[which(!duplicated(keggpath))]

save(keggpath_metab, file="Taware/pathwaydatabase760/keggpath_metab_760.RData")


############################   ========================= proteinhmdb =========================   ############################ 

proteinhmdb<- data.frame(fread("Taware/pathwaydatabase760/mbrole2_proteinhmdb.tsv"))
proteinhmdbpath_chebi = proteinhmdb$Submitted.IDs
names(proteinhmdbpath_chebi) = proteinhmdb$ID.Annotation
listproteinhmdbpath = lapply(proteinhmdbpath_chebi, function(x) unlist(strsplit(x, "|", fixed = TRUE)) )

## remove duplicate the pathways
listproteinhmdbpath_uniq = listproteinhmdbpath[which(!duplicated(listproteinhmdbpath))]

## map cheiID to metabolites identifies
proteinhmdbpath <- lapply(listproteinhmdbpath_uniq, function(x) ## for each wikipathway, to retrieve which metabolites of my data are belong to this pathway
  data_760$metabolite_identification[which(chebi_760 %in% x )] )

proteinhmdbpath_metab = proteinhmdbpath[which(!duplicated(proteinhmdbpath))]

save(proteinhmdbpath_metab, file="Taware/pathwaydatabase760/proteinhmdbpath_metab_760.RData")


############################   ========================= biofunction =========================   ############################ 

biofunction<- data.frame(fread("Taware/pathwaydatabase760/mbrole2_biofunctions.tsv"))
biofunctionpath_chebi = biofunction$Submitted.IDs
names(biofunctionpath_chebi) = biofunction$ID.Annotation
listbiofunctionpath = lapply(biofunctionpath_chebi, function(x) unlist(strsplit(x, "|", fixed = TRUE)) )

## remove duplicate the pathways
listbiofunctionpath_uniq = listbiofunctionpath[which(!duplicated(listbiofunctionpath))]

## map cheiID to metabolites identifies
biofunctionpath <- lapply(listbiofunctionpath_uniq, function(x) ## for each wikipathway, to retrieve which metabolites of my data are belong to this pathway
  data_760$metabolite_identification[which(chebi_760 %in% x )] )

biofunctionpath_metab = biofunctionpath[which(!duplicated(biofunctionpath))]

save(biofunctionpath_metab, file="Taware/pathwaydatabase760/biofunctionpath_metab_760.RData")





############################   ========================= wikipathway =========================   ############################ 
library(rWikiPathways)
listMetab <- NULL
Pathways<-listPathwayIds(organism="Homo sapiens")
wikipath_info<-data.frame(id=1:length(Pathways))
wikipath_info$names<-Pathways
#list of chebi IDs per pathway
listMetab<-apply(as.matrix(Pathways),1, function(x) getXrefList(pathway = x, systemCode="Ce"))

#remove chebiID character
listMetab<-lapply(listMetab, function(x) gsub('^\\CHEBI:|\\$', '', x))
listMetab<-lapply(listMetab, function(x) unique(x))
wikipath_info$chebi_path = listMetab
wikipath_info$path_length<-sapply(listMetab, function(x) length(x))


#map chebi IDs to  your own metabolite names
wikiMetab<-lapply(listMetab, function(x) ## for each wikipathway, to retrieve which metabolites of my data are belong to this pathway
  data_760$metabolite_identification[which(sapply(chebi_760, function(x) gsub('^\\CHEBI:|\\$', '', x)) %in% x)])  

# remove empty pathways
wikipath = wikiMetab[sapply(wikiMetab,length)>0]
wikipath_uniq = wikipath[which(!duplicated(wikipath))]
save(wikipath_uniq, file="Taware/wikipath_metab_760.RData")



# ############################   ========================= smpdb_ECMDB_pathway =========================   ############################ 
# 
# smpdbecmdb <- data.frame(fread("Taware/mbrole2_smpdbecmdb.tsv"))
# smpdbecmdbpath_chebi = smpdbecmdb$Submitted.IDs
# names(smpdbecmdbpath_chebi) = smpdbecmdb$ID.Annotation
# listsmpdbecmdbpath = lapply(smpdbecmdbpath_chebi, function(x) unlist(strsplit(x, "|", fixed = TRUE)) )
# 
# ## remove duplicate the pathways
# listsmpdbecmdbpath_uniq = listsmpdbecmdbpath[which(!duplicated(listsmpdbecmdbpath))]
# 
# ## map cheiID to metabolites identifies
# smpdbecmdbpath <- lapply(listsmpdbecmdbpath_uniq, function(x) ## for each wikipathway, to retrieve which metabolites of my data are belong to this pathway
#   data_760$metabolite_identification[which(chebi_760 %in% x )] )
# 
# smpdbecmdbpath_metab = smpdbecmdbpath[which(!duplicated(smpdbecmdbpath))]
# 
# save(smpdbecmdbpath_metab, file="Taware/smpdbecmdbpath_metab_760.RData")
# 
# 
# 
# 
# ############################   ========================= smpdb_YMDB_pathway =========================   ############################ 
# 
# smpdbymdb <- data.frame(fread("Taware/mbrole2_smpdbymdb.tsv"))
# smpdbymdbpath_chebi = smpdbymdb$Submitted.IDs
# names(smpdbymdbpath_chebi) = smpdbymdb$ID.Annotation
# listsmpdbymdbpath = lapply(smpdbymdbpath_chebi, function(x) unlist(strsplit(x, "|", fixed = TRUE)) )
# 
# ## remove duplicate the pathways
# listsmpdbymdbpath_uniq = listsmpdbymdbpath[which(!duplicated(listsmpdbymdbpath))]
# 
# ## map cheiID to metabolites identifies
# smpdbymdbpath <- lapply(listsmpdbymdbpath_uniq, function(x) ## for each wikipathway, to retrieve which metabolites of my data are belong to this pathway
#   data_760$metabolite_identification[which(chebi_760 %in% x )] )
# 
# smpdbymdbpath_metab = smpdbymdbpath[which(!duplicated(smpdbymdbpath))]
# 
# save(smpdbymdbpath_metab, file="Taware/smpdbymdbpath_metab_760.RData")


############################   ========================= chebirole =========================   ############################ 

# chebirole<- data.frame(fread("Taware/pathwaydatabase760/mbrole2_chebirole.tsv"))
# chebirolepath_chebi = chebirole$Submitted.IDs
# names(chebirolepath_chebi) = chebirole$ID.Annotation
# listchebirolepath = lapply(chebirolepath_chebi, function(x) unlist(strsplit(x, "|", fixed = TRUE)) )
# 
# ## remove duplicate the pathways
# listchebirolepath_uniq = listchebirolepath[which(!duplicated(listchebirolepath))]
# 
# ## map cheiID to metabolites identifies
# chebirolepath <- lapply(listchebirolepath_uniq, function(x) ## for each wikipathway, to retrieve which metabolites of my data are belong to this pathway
#   data_760$metabolite_identification[which(chebi_760 %in% x )] )
# 
# chebirolepath_metab = chebirolepath[which(!duplicated(chebirolepath))]
# 
# save(chebirolepath_metab, file="Taware/pathwaydatabase760/chebirolepath_metab_760.RData")


